1. Read and Analyse Dataset. Clearly write outcome of data analysis¶

In [33]:
print("***1.A. Clean the Structured Data***")
***1.A. Clean the Structured Data***
In [36]:
print('#1.A.I. Missing value analysis and imputation. After checking for missing values, Write a function to plot the missing values in each column.')
#1.A.I. Missing value analysis and imputation. After checking for missing values, Write a function to plot the missing values in each column.
In [ ]:
import pandas as pd
In [4]:
blog=pd.read_csv(r'G:\AIML Course Materials\Projects\NLP_Additional Project\TheSocialDilemma.csv')
In [6]:
blog.shape
Out[6]:
(20068, 14)
In [14]:
blog.head()
Out[14]:
user_name user_location user_description user_created user_followers user_friends user_favourites user_verified date text hashtags source is_retweet Sentiment
0 Mari Smith San Diego, California Premier Facebook Marketing Expert | Social Med... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:55:33 @musicmadmarc @SocialDilemma_ @netflix @Facebo... NaN Twitter Web App False Neutral
1 Mari Smith San Diego, California Premier Facebook Marketing Expert | Social Med... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:53:17 @musicmadmarc @SocialDilemma_ @netflix @Facebo... NaN Twitter Web App False Neutral
2 Varun Tyagi Goa, India Indian | Tech Solution Artist & Hospitality Ex... 2009-09-06 10:36:01 257 204 475 False 2020-09-16 20:51:57 Go watch “The Social Dilemma” on Netflix!\n\nI... NaN Twitter for iPhone False Positive
3 Casey Conway Sydney, New South Wales Head of Diversity & Inclusion @RugbyAU | It's ... 2012-12-28 21:45:06 11782 1033 12219 True 2020-09-16 20:51:46 I watched #TheSocialDilemma last night. I’m sc... ['TheSocialDilemma'] Twitter for iPhone False Negative
4 Charlotte Paul Darlington Instagram Charlottejyates 2012-05-28 20:43:08 278 387 5850 False 2020-09-16 20:51:11 The problem of me being on my phone most the t... ['TheSocialDilemma'] Twitter for iPhone False Positive
In [15]:
blog.tail()
Out[15]:
user_name user_location user_description user_created user_followers user_friends user_favourites user_verified date text hashtags source is_retweet Sentiment
20063 scp. NaN “Through love, all is possible.” - SJM - See m... 2013-02-19 00:55:12 431 193 32958 False 2020-10-09 00:25:53 #TheSocialDilemma yalll.... this shit... we kn... ['TheSocialDilemma'] Twitter for iPhone False Negative
20064 Dono6971 United States Father, Husband, and a Dude|| Love Notre Dame ... 2010-01-06 04:08:41 172 96 50159 False 2020-10-09 00:24:45 Peeps:\n\nFind 90 minutes this weekend and wat... NaN Twitter for iPhone False Positive
20065 Remi Shores NaN Genderfluid / They/Them/Theirs / Queer Christi... 2012-05-16 23:49:13 387 652 7885 False 2020-10-09 00:11:42 So you watched #thesocialdilemma, or have been... ['thesocialdilemma'] Twitter Web App False Negative
20066 Scott the Great and Terrible NaN I can't recall the taste of food, nor the soun... 2020-03-16 18:20:31 103 84 2976 False 2020-10-09 00:10:16 Good social media advice:\n\nChoose the thing ... ['TheSocialDilemma'] Twitter Web App False Positive
20067 Get Outside Media Telluride, CO CREATIVE AGENCY | BRAND + CONTENT + DESIGN + P... 2018-07-14 04:44:23 133 898 1131 False 2020-10-09 00:00:31 Boulder director Jeff Orlowski hopes viewers o... ['TheSocialDilemma'] Hootsuite Inc. False Neutral
In [12]:
blog.isnull().sum()
Out[12]:
user_name              1
user_location       4208
user_description    1383
user_created           0
user_followers         0
user_friends           0
user_favourites        0
user_verified          0
date                   0
text                   0
hashtags            4297
source                 0
is_retweet             0
Sentiment              0
dtype: int64
In [27]:
# Missing values found in columns user_name,user_location, user_description & hashtags. These columns needs to be imputed to get rid of missing values
In [19]:
dt=blog.copy()
In [20]:
dt.shape
Out[20]:
(20068, 14)
In [21]:
dt.head()
Out[21]:
user_name user_location user_description user_created user_followers user_friends user_favourites user_verified date text hashtags source is_retweet Sentiment
0 Mari Smith San Diego, California Premier Facebook Marketing Expert | Social Med... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:55:33 @musicmadmarc @SocialDilemma_ @netflix @Facebo... NaN Twitter Web App False Neutral
1 Mari Smith San Diego, California Premier Facebook Marketing Expert | Social Med... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:53:17 @musicmadmarc @SocialDilemma_ @netflix @Facebo... NaN Twitter Web App False Neutral
2 Varun Tyagi Goa, India Indian | Tech Solution Artist & Hospitality Ex... 2009-09-06 10:36:01 257 204 475 False 2020-09-16 20:51:57 Go watch “The Social Dilemma” on Netflix!\n\nI... NaN Twitter for iPhone False Positive
3 Casey Conway Sydney, New South Wales Head of Diversity & Inclusion @RugbyAU | It's ... 2012-12-28 21:45:06 11782 1033 12219 True 2020-09-16 20:51:46 I watched #TheSocialDilemma last night. I’m sc... ['TheSocialDilemma'] Twitter for iPhone False Negative
4 Charlotte Paul Darlington Instagram Charlottejyates 2012-05-28 20:43:08 278 387 5850 False 2020-09-16 20:51:11 The problem of me being on my phone most the t... ['TheSocialDilemma'] Twitter for iPhone False Positive
In [22]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_missing_values(df):
    # Calculating % of missing values in each column
    percent_missing = dt.isnull().sum() / len(dt) * 100

    # Create a DataFrame with column names and missing percentages
    missing_data = pd.DataFrame({'Column': dt.columns, 'Missing %': percent_missing})

    # Sort the DataFrame by missing percentage in descending order
    missing_data = missing_data.sort_values('Missing %', ascending=False)

    # Plot the missing values using a bar plot
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Column', y='Missing %', data=missing_data)
    plt.xticks(rotation=90)
    plt.xlabel('Columns')
    plt.ylabel('Missing Percentage')
    plt.title('Missing Values in Each Column')
    plt.show()
In [23]:
plot_missing_values(dt)
In [29]:
#Imputing missing values
In [25]:
# Impute missing values in 'hashtags' column with 'NoHashtag'
dt['hashtags'].fillna('NoHashtag', inplace=True)

# Impute missing values in 'user_location' column with a default value
dt['user_location'].fillna('Unknown', inplace=True)

# Impute missing values in 'user_description' column with an empty string
dt['user_description'].fillna('', inplace=True)

# Impute missing values in 'user_name' column with a default value
dt['user_name'].fillna('Unknown', inplace=True)
In [28]:
dt.isnull().sum()
Out[28]:
user_name           0
user_location       0
user_description    0
user_created        0
user_followers      0
user_friends        0
user_favourites     0
user_verified       0
date                0
text                0
hashtags            0
source              0
is_retweet          0
Sentiment           0
dtype: int64
In [38]:
print('1.A.II. Eliminate Non-English textual data.')
1.A.II. Eliminate Non-English textual data.
In [39]:
!pip install langdetect
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
Requirement already satisfied: six in f:\anaconda3\lib\site-packages (from langdetect) (1.16.0)
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=8becf78419c9c23c7dc8c660ea740b89760dc47e62d846f5b56eea89e87fde94
  Stored in directory: c:\users\richard\appdata\local\pip\cache\wheels\d1\c1\d9\7e068de779d863bc8f8fc9467d85e25cfe47fa5051fff1a1bb
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
In [41]:
dt.head()
Out[41]:
user_name user_location user_description user_created user_followers user_friends user_favourites user_verified date text hashtags source is_retweet Sentiment
0 Mari Smith San Diego, California Premier Facebook Marketing Expert | Social Med... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:55:33 @musicmadmarc @SocialDilemma_ @netflix @Facebo... NoHashtag Twitter Web App False Neutral
1 Mari Smith San Diego, California Premier Facebook Marketing Expert | Social Med... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:53:17 @musicmadmarc @SocialDilemma_ @netflix @Facebo... NoHashtag Twitter Web App False Neutral
2 Varun Tyagi Goa, India Indian | Tech Solution Artist & Hospitality Ex... 2009-09-06 10:36:01 257 204 475 False 2020-09-16 20:51:57 Go watch “The Social Dilemma” on Netflix!\n\nI... NoHashtag Twitter for iPhone False Positive
3 Casey Conway Sydney, New South Wales Head of Diversity & Inclusion @RugbyAU | It's ... 2012-12-28 21:45:06 11782 1033 12219 True 2020-09-16 20:51:46 I watched #TheSocialDilemma last night. I’m sc... ['TheSocialDilemma'] Twitter for iPhone False Negative
4 Charlotte Paul Darlington Instagram Charlottejyates 2012-05-28 20:43:08 278 387 5850 False 2020-09-16 20:51:11 The problem of me being on my phone most the t... ['TheSocialDilemma'] Twitter for iPhone False Positive
In [42]:
dt.columns
Out[42]:
Index(['user_name', 'user_location', 'user_description', 'user_created',
       'user_followers', 'user_friends', 'user_favourites', 'user_verified',
       'date', 'text', 'hashtags', 'source', 'is_retweet', 'Sentiment'],
      dtype='object')
In [51]:
from langdetect import detect

text_columns = ['user_name', 'user_location', 'user_description', 'text']
dt['language'] = dt[text_columns].apply(lambda row: detect(' '.join(row.values.astype(str))) if row.notnull().all() else '', axis=1)
dt = dt[dt['language'] == 'en']
dt = dt.drop('language', axis=1)
In [80]:
dt = dt.reset_index(drop=True) #Reset the row numbering
In [81]:
dt.shape
Out[81]:
(19901, 14)
In [89]:
# After eliminating non-english textual data, number of rows has reduced from 20068 to 19901
In [90]:
dt.head()
Out[90]:
user_name user_location user_description user_created user_followers user_friends user_favourites user_verified date text hashtags source is_retweet Sentiment
0 Mari Smith San Diego, California Premier Facebook Marketing Expert | Social Med... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:55:33 @musicmadmarc @SocialDilemma_ @netflix @Facebo... NoHashtag Twitter Web App False Neutral
1 Mari Smith San Diego, California Premier Facebook Marketing Expert | Social Med... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:53:17 @musicmadmarc @SocialDilemma_ @netflix @Facebo... NoHashtag Twitter Web App False Neutral
2 Varun Tyagi Goa, India Indian | Tech Solution Artist & Hospitality Ex... 2009-09-06 10:36:01 257 204 475 False 2020-09-16 20:51:57 Go watch “The Social Dilemma” on Netflix!\n\nI... NoHashtag Twitter for iPhone False Positive
3 Casey Conway Sydney, New South Wales Head of Diversity & Inclusion @RugbyAU | It's ... 2012-12-28 21:45:06 11782 1033 12219 True 2020-09-16 20:51:46 I watched #TheSocialDilemma last night. I’m sc... ['TheSocialDilemma'] Twitter for iPhone False Negative
4 Charlotte Paul Darlington Instagram Charlottejyates 2012-05-28 20:43:08 278 387 5850 False 2020-09-16 20:51:11 The problem of me being on my phone most the t... ['TheSocialDilemma'] Twitter for iPhone False Positive
In [91]:
blog['text'].head(10)
Out[91]:
0    @musicmadmarc @SocialDilemma_ @netflix @Facebo...
1    @musicmadmarc @SocialDilemma_ @netflix @Facebo...
2    Go watch “The Social Dilemma” on Netflix!\n\nI...
3    I watched #TheSocialDilemma last night. I’m sc...
4    The problem of me being on my phone most the t...
5    #TheSocialDilemma 😳 wow!! We need regulations ...
6    @harari_yuval what do you think about #TheSoci...
7    Erm #TheSocialDilemma makes me want to go off ...
8    #TheSocialDilemma is not a documentary, it's h...
9             Okay i’m watching #TheSocialDilemma now.
Name: text, dtype: object
In [92]:
dt['text'].head(10)
Out[92]:
0    @musicmadmarc @SocialDilemma_ @netflix @Facebo...
1    @musicmadmarc @SocialDilemma_ @netflix @Facebo...
2    Go watch “The Social Dilemma” on Netflix!\n\nI...
3    I watched #TheSocialDilemma last night. I’m sc...
4    The problem of me being on my phone most the t...
5    #TheSocialDilemma 😳 wow!! We need regulations ...
6    Erm #TheSocialDilemma makes me want to go off ...
7    #TheSocialDilemma is not a documentary, it's h...
8             Okay i’m watching #TheSocialDilemma now.
9    Okey okey, I’ve been peer pressured into watch...
Name: text, dtype: object
In [93]:
blog['user_description'].head(10)
Out[93]:
0    Premier Facebook Marketing Expert | Social Med...
1    Premier Facebook Marketing Expert | Social Med...
2    Indian | Tech Solution Artist & Hospitality Ex...
3    Head of Diversity & Inclusion @RugbyAU | It's ...
4                            Instagram Charlottejyates
5                                                  NaN
6    Küçük küçük şeyler söyler, küçük küçük videola...
7    Mother, optimist, feminist, pacifist, retired ...
8    African🌍 | Music🎶 | Lakers🏀|Manchester United ...
9    IG:@RYANWHITEC 💻Digital Content Creator. 97.9 ...
Name: user_description, dtype: object
In [94]:
dt['user_description'].head(10)
Out[94]:
0    Premier Facebook Marketing Expert | Social Med...
1    Premier Facebook Marketing Expert | Social Med...
2    Indian | Tech Solution Artist & Hospitality Ex...
3    Head of Diversity & Inclusion @RugbyAU | It's ...
4                            Instagram Charlottejyates
5                                                     
6    Mother, optimist, feminist, pacifist, retired ...
7    African🌍 | Music🎶 | Lakers🏀|Manchester United ...
8    IG:@RYANWHITEC 💻Digital Content Creator. 97.9 ...
9    Science kid. Herbivore. Opinionated. Tweets ab...
Name: user_description, dtype: object
In [95]:
#In the above example, 6th row was elimnated after considering removal of non-english text data
In [101]:
print('***1.B. Write a custom function to plot the count of unique functions in every column***')
***1.B. Write a custom function to plot the count of unique functions in every column***
In [102]:
import matplotlib.pyplot as plt

def plot_unique_value_counts(df):
    unique_counts = df.nunique()
    column_names = unique_counts.index
    count_values = unique_counts.values

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(column_names, count_values)
    ax.set_xticks(range(len(column_names)))  # Set tick locations
    ax.set_xticklabels(column_names, rotation=90)
    ax.set_xlabel('Columns')
    ax.set_ylabel('Count of Unique Values')
    ax.set_title('Count of Unique Values in Each Column')
    plt.tight_layout()
    plt.show()
In [103]:
plot_unique_value_counts(dt)
In [105]:
print('***1.C. plot for Social Dilemma Sentiment Labels***')
***1.C. plot for Social Dilemma Sentiment Labels***
In [106]:
import matplotlib.pyplot as plt

def plot_unique_counts(df, column_name):
    unique_counts = dt[column_name].value_counts()
    
    plt.figure(figsize=(8, 6))
    plt.bar(unique_counts.index, unique_counts.values)
    plt.xlabel(column_name)
    plt.ylabel('Count')
    plt.title(f'Count of Unique Values in {column_name}')
    plt.show()
In [109]:
plot_unique_counts(dt,'Sentiment')
In [111]:
# The dataset has more positive sentiments compared to negative and neutral sentiments
In [112]:
print('***1.E. Plot and identify the top 20 users, user sources, user locations by number of tweets***')
***1.E. Plot and identify the top 20 users, user sources, user locations by number of tweets***
In [147]:
import plotly.express as px

def plot_top_n(df, column_name, n=20):
    top_n = dt[column_name].value_counts().head(n)

    fig = px.bar(top_n, x=top_n.index, y=top_n.values, labels={column_name: column_name, 'index': 'Number of Tweets'})
    fig.update_layout(title=f'Top {n} {column_name} by Number of Tweets', xaxis_tickangle=-45)
    fig.show()
In [148]:
# Use the above function to identify the top 20 users, user sources, user locations
In [149]:
plot_top_n(dt, 'user_name', n=20)
In [150]:
# User 'OurPact' has highest number of tweets of the top 20
In [151]:
plot_top_n(dt, 'source', n=20)
In [152]:
# User source of 'Twitter for iPhone' has highest number of tweets of the top 20
In [153]:
plot_top_n(dt, 'user_location', n=20)
In [154]:
#The above data is interesting as highest tweet has come from unknown locations. This could be most of the people tweeting would not prefer to share their location.However, the next highest is from India
In [155]:
print("***1.E. Take the top 50 user locations based on no of tweets and try to make the format into city, country for these locations. Incase if only city is present we try to map it to the country from the previous data available***")
***1.E. Take the top 50 user locations based on no of tweets and try to make the format into city, country for these locations. Incase if only city is present we try to map it to the country from the previous data available***
In [157]:
#We can use the geopy library to geocode the locations and retrieve the corresponding city and country information.
In [158]:
!pip install geopy
Collecting geopy
  Downloading geopy-2.3.0-py3-none-any.whl (119 kB)
Collecting geographiclib<3,>=1.52
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.3.0
In [159]:
from geopy.geocoders import Nominatim

def format_location(location):
    geolocator = Nominatim(user_agent="my_app")
    
    # If location already in "city, country" format
    if "," in location:
        return location
    
    # Geocode location to get city and country information
    location_info = geolocator.geocode(location, timeout=10)
    
    if location_info is not None:
        city = location_info.raw.get('address', {}).get('city')
        country = location_info.raw.get('address', {}).get('country')
        
        # If city and country are available
        if city and country:
            return f"{city}, {country}"
        
    # If city and country are not available, return the original location
    return location

# Assuming you have a DataFrame called 'df' with a 'user_location' column
top_50_locations = dt['user_location'].value_counts().head(50).index.tolist()

formatted_locations = [format_location(location) for location in top_50_locations]

# Create a new DataFrame with formatted locations
formatted_df = pd.DataFrame({'user_location': formatted_locations})

# Print the formatted DataFrame
print(formatted_df)
                 user_location
0                      Unknown
1                        India
2                Mumbai, India
3              Los Angeles, CA
4              London, England
5                San Diego, CA
6             New Delhi, India
7                       London
8                United States
9                       Canada
10                New York, NY
11                      Mumbai
12            Bengaluru, India
13              United Kingdom
14            Hyderabad, India
15                 Chicago, IL
16                 Pune, India
17                 Atlanta, GA
18             California, USA
19           San Francisco, CA
20                 Los Angeles
21            Toronto, Ontario
22                 Seattle, WA
23              Washington, DC
24               New York, USA
25         Melbourne, Victoria
26                South Africa
27                  Dallas, TX
28                   New Delhi
29                  Boston, MA
30                 Philippines
31  Johannesburg, South Africa
32                Portland, OR
33                       Earth
34              Chennai, India
35                  Austin, TX
36                  Texas, USA
37                         USA
38                       Delhi
39     England, United Kingdom
40              Winchester, VA
41                    New York
42                   Bangalore
43                          UK
44              Nairobi, Kenya
45                 Houston, TX
46                   Australia
47                  London, UK
48     Cape Town, South Africa
49                   Singapore
In [160]:
print('***1.F. Plot the count of tweets from every place identified above.***')
***1.F. Plot the count of tweets from every place identified above.***
In [162]:
# Count the number of tweets for each location
location_counts = formatted_df['user_location'].value_counts()

# Plot the count of tweets for each location
plt.figure(figsize=(12, 6))
location_counts.plot(kind='bar')
plt.xlabel('Location')
plt.ylabel('Number of Tweets')
plt.title('Count of Tweets from Each Place')
plt.xticks(rotation=90)

# Display the actual counts above each bar
for i, count in enumerate(location_counts.values):
    plt.text(i, count, str(count), ha='center', va='bottom')

plt.show()
In [166]:
# The above data shoes top 50 tweets from each identified location. Some are country in therir respective state, some are only country and 1 unknown
In [168]:
print('***1.G. Get the number of Hashtags present in each tweet and plot the distribution of number of hashtags intweet.***')
***1.G. Get the number of Hashtags present in each tweet and plot the distribution of number of hashtags intweet.***
In [172]:
# Get the number of hashtags in each tweet
dt['Count_Hashtags'] = dt['hashtags'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

# Plot the distribution of the number of hashtags
plt.figure(figsize=(10, 6))
plt.hist(dt['Count_Hashtags'], bins=range(11))
plt.xlabel('Number of Hashtags')
plt.ylabel('Frequency')
plt.title('Distribution of Number of Hashtags in Tweets')
plt.xticks(range(11))
plt.show()
In [173]:
# From the above dats we could find number of hashtags between 1 to 2 has the largest counts
In [174]:
print('***1.H. Plot the daily and hourly distribution of tweets***')
***1.H. Plot the daily and hourly distribution of tweets***
In [175]:
dt.columns
Out[175]:
Index(['user_name', 'user_location', 'user_description', 'user_created',
       'user_followers', 'user_friends', 'user_favourites', 'user_verified',
       'date', 'text', 'hashtags', 'source', 'is_retweet', 'Sentiment',
       'count_Hashtags', 'Count_Hashtags'],
      dtype='object')
In [188]:
# Convert 'date' column to datetime type
dt['date'] = pd.to_datetime(dt['date'])

# Extract the date and hour information
dt['day'] = dt['date'].dt.date
dt['hour'] = dt['date'].dt.hour

# Calculate the daily and hourly tweet counts
daily_counts = dt['day'].value_counts().sort_index()
hourly_counts = dt['hour'].value_counts().sort_index()

# Plot the daily distribution of tweets
plt.figure(figsize=(12, 6))
daily_counts.plot(kind='line')
plt.xlabel('Date')
plt.ylabel('Number of Tweets')
plt.title('Daily Distribution of Tweets')
plt.xticks(daily_counts.index, daily_counts.index.astype(str), rotation=45)
plt.tight_layout()
plt.show()

# Plot the hourly distribution of tweets
plt.figure(figsize=(12, 6))
hourly_counts.plot(kind='bar')
plt.xlabel('Hour')
plt.ylabel('Number of Tweets')
plt.title('Hourly Distribution of Tweets')
plt.xticks(range(24),rotation=45)
plt.tight_layout()
plt.show()
In [189]:
#The following observation are made from the above graphs:
    #12th Spetember 2020 has highest number of tweeks. 
    #More tweets are active in Spetember only. In OCtober 2020, the tweet counts are lower than ~250
    #Tweets are generally higher betweeb 6 to 8 PM. 
    #Tweets are quite active between 1AM to 3 PM, 2 PM to 10 PM 
In [190]:
print('***I. Identify the number of users created every year and plot the distribution***')
***I. Identify the number of users created every year and plot the distribution***
In [192]:
# Converting 'user_created' column to datetime type
dt['user_created'] = pd.to_datetime(dt['user_created'])

# Extract the year from 'user_created' column
dt['user_created_year'] = dt['user_created'].dt.year

# Count the number of users for each year
users_created_per_year = dt['user_created_year'].value_counts().sort_index()

# Plot the distribution of users created per year
plt.figure(figsize=(12, 6))
plt.bar(users_created_per_year.index, users_created_per_year.values)
plt.xlabel('Year')
plt.ylabel('Number of Users')
plt.title('Distribution of Users Created per Year')

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [193]:
# From the above graph, it is identified that 2009 has highest number if users created
In [194]:
print('***1.J. Find the top 10 hashtags used in the tweet***')
***1.J. Find the top 10 hashtags used in the tweet***
In [197]:
# Extract the hashtags from 'hashtags' column
hashtags = dt['hashtags'].str.lower().str.split(',')

# Flatten the list of hashtags
hashtags_flat = [tag for sublist in hashtags for tag in sublist]

# Calculate the frequency of each hashtag
top_hashtags = pd.Series(hashtags_flat).value_counts().head(10)

# Print the top 10 hashtags
print(top_hashtags)
['thesocialdilemma']    12571
nohashtag                4280
['thesocialdilemma'      1678
 'netflix']               664
 'thesocialdilemma']      639
 'thesocialdilemma'       199
['netflix'                175
 'netflix'                138
 'socialmedia']           121
 'socialmedia'             58
dtype: int64
In [198]:
# From the above analysis, it is noted that 'thesocialdiemna' hashtag has the highest counts. 4280 tweets does not have hashtag
In [199]:
print('***1.K. Get the number of words in each text and plot the distribution of number of words for each class.***')
***1.K. Get the number of words in each text and plot the distribution of number of words for each class.***
In [200]:
# Calculate the number of words in each text
dt['word_count'] = dt['text'].str.split().apply(len)

# Group the texts by class
grouped = dt.groupby('Sentiment')

# Plot the distribution of number of words for each class
plt.figure(figsize=(10, 6))
for sentiment, group in grouped:
    plt.hist(group['word_count'], bins=20, alpha=0.5, label=sentiment)

plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.title('Distribution of Number of Words by Sentiment')
plt.legend()
plt.show()
In [202]:
# From the above graph, the following observatinos are noted:
    #Word counts between about 19 to 22 generally tend to have positive sentiment where most of the user tweets are found
    #Word counts below 5 tend to be neutral with negligible contribution of negative sentiment
In [203]:
print('***1.L. Plot the word cloud for negative and positive tweets and write your inferences***')
***1.L. Plot the word cloud for negative and positive tweets and write your inferences***
In [223]:
!pip install plotly wordcloud
Requirement already satisfied: plotly in f:\anaconda3\lib\site-packages (5.6.0)
Requirement already satisfied: wordcloud in f:\anaconda3\lib\site-packages (1.9.2)
Requirement already satisfied: six in f:\anaconda3\lib\site-packages (from plotly) (1.16.0)
Requirement already satisfied: tenacity>=6.2.0 in f:\anaconda3\lib\site-packages (from plotly) (8.0.1)
Requirement already satisfied: matplotlib in f:\anaconda3\lib\site-packages (from wordcloud) (3.5.1)
Requirement already satisfied: pillow in f:\anaconda3\lib\site-packages (from wordcloud) (9.0.1)
Requirement already satisfied: numpy>=1.6.1 in f:\anaconda3\lib\site-packages (from wordcloud) (1.21.5)
Requirement already satisfied: packaging>=20.0 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (21.3)
Requirement already satisfied: fonttools>=4.22.0 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (4.25.0)
Requirement already satisfied: pyparsing>=2.2.1 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (3.0.4)
Requirement already satisfied: python-dateutil>=2.7 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.8.2)
Requirement already satisfied: kiwisolver>=1.0.1 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.3.2)
Requirement already satisfied: cycler>=0.10 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (0.11.0)
In [255]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re


# Define a regex pattern to match special characters
pattern = r'[^a-zA-Z0-9\s]'

# Remove special characters from the 'Text' column
dt['text'] = dt['text'].str.replace(pattern, '')
dt['user_description'] = dt['user_description'].str.replace(pattern, '')
dt['hashtags'] = dt['hashtags'].str.replace(pattern, '')

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')  # Download the tokenization models

# Filter the positive and negative tweets
positive_tweets = dt[dt['Sentiment'] == 'positive']
negative_tweets = dt[dt['Sentiment'] == 'negative']

# Combine all positive and negative tweets into a single string
positive_text = ' '.join(positive_tweets['text'])
negative_text = ' '.join(negative_tweets['text'])

# Tokenize the positive and negative text
positive_tokens = word_tokenize(positive_text)
negative_tokens = word_tokenize(negative_text)

# Remove stopwords from the tokenized text
stopwords = set(stopwords.words('english'))
unwanted_words = ['word1', 'word2', 'word3']  # Add your unwanted words here
positive_tokens = [token.lower() for token in positive_tokens if token.lower() not in stopwords and token.lower() not in unwanted_words]
negative_tokens = [token.lower() for token in negative_tokens if token.lower() not in stopwords and token.lower() not in unwanted_words]

# Calculate word frequency
positive_word_freq = Counter(positive_tokens)
negative_word_freq = Counter(negative_tokens)

# Get the most frequent words and their frequencies
positive_most_common = positive_word_freq.most_common(20)
negative_most_common = negative_word_freq.most_common(20)

# Extract the words and frequencies if available
positive_words, positive_freq = zip(*positive_most_common) if positive_most_common else ([], [])
negative_words, negative_freq = zip(*negative_most_common) if negative_most_common else ([], [])

# Check if there are any words present before plotting the bar plots
if positive_words and positive_words[0]:
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    sns.barplot(x=positive_freq, y=positive_words)
    plt.title('Most Frequent Words - Positive Tweets')
    plt.xlabel('Frequency')

if negative_words and negative_words[0]:
    if not positive_words or not positive_words[0]:
        plt.figure(figsize=(6, 6))
    plt.subplot(1, 2, 2)
    sns.barplot(x=negative_freq, y=negative_words)
    plt.title('Most Frequent Words - Negative Tweets')
    plt.xlabel('Frequency')

plt.tight_layout()
plt.show()
C:\Users\Richard\AppData\Local\Temp\ipykernel_17056\3271237600.py:15: FutureWarning:

The default value of regex will change from True to False in a future version.

C:\Users\Richard\AppData\Local\Temp\ipykernel_17056\3271237600.py:16: FutureWarning:

The default value of regex will change from True to False in a future version.

C:\Users\Richard\AppData\Local\Temp\ipykernel_17056\3271237600.py:17: FutureWarning:

The default value of regex will change from True to False in a future version.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Richard\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Richard\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
<Figure size 432x288 with 0 Axes>
In [257]:
# Save DataFrame to CSV
dt.to_csv('G:\\AIML Course Materials\\Projects\\NLP_Additional Project\\blog_dt.csv', index=False)
In [261]:
data=pd.read_csv('G:\\AIML Course Materials\\Projects\\NLP_Additional Project\\blog_dt.csv')
In [262]:
data.shape
Out[262]:
(19901, 20)
In [270]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')  # Download the tokenization models

# Filter the positive and negative tweets
positive_tweets = data[data['Sentiment'] == 'positive']
negative_tweets = data[data['Sentiment'] == 'negative']

# Combine all positive and negative tweets into a single string
positive_text = ' '.join(positive_tweets['text'])
negative_text = ' '.join(negative_tweets['text'])

# Tokenize the positive and negative text
positive_tokens = word_tokenize(positive_text)
negative_tokens = word_tokenize(negative_text)

# Remove stopwords from the tokenized text
stopwords = set(stopwords.words('english'))
unwanted_words = ['word1', 'word2', 'word3']  # Add your unwanted words here
positive_tokens = [token.lower() for token in positive_tokens if token.lower() not in stopwords and token.lower() not in unwanted_words]
negative_tokens = [token.lower() for token in negative_tokens if token.lower() not in stopwords and token.lower() not in unwanted_words]

# Calculate word frequency
positive_word_freq = Counter(positive_tokens)
negative_word_freq = Counter(negative_tokens)

# Get the most frequent words and their frequencies
positive_most_common = positive_word_freq.most_common(20)
negative_most_common = negative_word_freq.most_common(20)

# Extract the words and frequencies if available
positive_words, positive_freq = zip(*positive_most_common) if positive_most_common else ([], [])
negative_words, negative_freq = zip(*negative_most_common) if negative_most_common else ([], [])

# Check if there are any words present before plotting the bar plots
if positive_words and positive_words[0]:
    plt.figure(figsize=(50, 6))
    plt.subplot(1, 2, 1)
    sns.barplot(x=positive_freq, y=positive_words)
    plt.title('Most Frequent Words - Positive Tweets')
    plt.xlabel('Frequency')

if negative_words and negative_words[0]:
    if not positive_words or not positive_words[0]:
        plt.figure(figsize=(25, 6))
    plt.subplot(1, 2, 2)
    sns.barplot(x=negative_freq, y=negative_words)
    plt.title('Most Frequent Words - Negative Tweets')
    plt.xlabel('Frequency')

#plt.tight_layout()
plt.show()
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Richard\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Richard\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [ ]:
 
In [ ]:
 

2. Data preparation.¶

In [271]:
#Pre-process the data using various techniques and libraries
In [272]:
print('***2.A. Eliminate All special Characters and Numbers***')
***2.A. Eliminate All special Characters and Numbers***
In [274]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19901 entries, 0 to 19900
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_name          19901 non-null  object
 1   user_location      19901 non-null  object
 2   user_description   18382 non-null  object
 3   user_created       19901 non-null  object
 4   user_followers     19901 non-null  int64 
 5   user_friends       19901 non-null  int64 
 6   user_favourites    19901 non-null  int64 
 7   user_verified      19901 non-null  bool  
 8   date               19901 non-null  object
 9   text               19901 non-null  object
 10  hashtags           19901 non-null  object
 11  source             19901 non-null  object
 12  is_retweet         19901 non-null  bool  
 13  Sentiment          19901 non-null  object
 14  count_Hashtags     19901 non-null  int64 
 15  Count_Hashtags     19901 non-null  int64 
 16  day                19901 non-null  object
 17  hour               19901 non-null  int64 
 18  user_created_year  19901 non-null  int64 
 19  word_count         19901 non-null  int64 
dtypes: bool(2), int64(8), object(10)
memory usage: 2.8+ MB
In [275]:
data.head()
Out[275]:
user_name user_location user_description user_created user_followers user_friends user_favourites user_verified date text hashtags source is_retweet Sentiment count_Hashtags Count_Hashtags day hour user_created_year word_count
0 Mari Smith San Diego, California Premier Facebook Marketing Expert Social Medi... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:55:33 musicmadmarc SocialDilemma netflix Facebook Im... NoHashtag Twitter Web App False Neutral 1 1 2020-09-16 20 2007 16
1 Mari Smith San Diego, California Premier Facebook Marketing Expert Social Medi... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:53:17 musicmadmarc SocialDilemma netflix Facebook ha... NoHashtag Twitter Web App False Neutral 1 1 2020-09-16 20 2007 18
2 Varun Tyagi Goa, India Indian Tech Solution Artist Hospitality Expe... 2009-09-06 10:36:01 257 204 475 False 2020-09-16 20:51:57 Go watch The Social Dilemma on Netflix\n\nIts ... NoHashtag Twitter for iPhone False Positive 1 1 2020-09-16 20 2009 20
3 Casey Conway Sydney, New South Wales Head of Diversity Inclusion RugbyAU Its not ... 2012-12-28 21:45:06 11782 1033 12219 True 2020-09-16 20:51:46 I watched TheSocialDilemma last night Im scare... TheSocialDilemma Twitter for iPhone False Negative 1 1 2020-09-16 20 2012 22
4 Charlotte Paul Darlington Instagram Charlottejyates 2012-05-28 20:43:08 278 387 5850 False 2020-09-16 20:51:11 The problem of me being on my phone most the t... TheSocialDilemma Twitter for iPhone False Positive 1 1 2020-09-16 20 2012 17
In [280]:
import re

# Define a regex pattern to match special characters and numbers
pattern = r'[^a-zA-Z\s]|\d+'

data['user_description'] = data['user_description'].astype(str)

# Apply the pattern to remove special characters and numbers from the 'text' column
data['text'] = data['text'].apply(lambda x: re.sub(pattern, '', x))
data['user_description'] = data['user_description'].apply(lambda x: re.sub(pattern, '', x))
data['hashtags'] = data['hashtags'].apply(lambda x: re.sub(pattern, '', x))
data['source'] = data['source'].apply(lambda x: re.sub(pattern, '', x))
data['user_name'] = data['user_name'].apply(lambda x: re.sub(pattern, '', x))
data['user_location'] = data['user_location'].apply(lambda x: re.sub(pattern, '', x))
In [281]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19901 entries, 0 to 19900
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_name          19901 non-null  object
 1   user_location      19901 non-null  object
 2   user_description   19901 non-null  object
 3   user_created       19901 non-null  object
 4   user_followers     19901 non-null  int64 
 5   user_friends       19901 non-null  int64 
 6   user_favourites    19901 non-null  int64 
 7   user_verified      19901 non-null  bool  
 8   date               19901 non-null  object
 9   text               19901 non-null  object
 10  hashtags           19901 non-null  object
 11  source             19901 non-null  object
 12  is_retweet         19901 non-null  bool  
 13  Sentiment          19901 non-null  object
 14  count_Hashtags     19901 non-null  int64 
 15  Count_Hashtags     19901 non-null  int64 
 16  day                19901 non-null  object
 17  hour               19901 non-null  int64 
 18  user_created_year  19901 non-null  int64 
 19  word_count         19901 non-null  int64 
dtypes: bool(2), int64(8), object(10)
memory usage: 2.8+ MB
In [282]:
print('***2.B. Remove html tags***')
***2.B. Remove html tags***
In [283]:
data.columns
Out[283]:
Index(['user_name', 'user_location', 'user_description', 'user_created',
       'user_followers', 'user_friends', 'user_favourites', 'user_verified',
       'date', 'text', 'hashtags', 'source', 'is_retweet', 'Sentiment',
       'count_Hashtags', 'Count_Hashtags', 'day', 'hour', 'user_created_year',
       'word_count'],
      dtype='object')
In [284]:
# Define a regular expression pattern to find HTML tags
pattern = r'<[^>]+>'

# Iterate over the object columns to find HTML tags
columns_to_check = ['user_name', 'user_location', 'user_description', 'text', 'hashtags', 'source']
for column in columns_to_check:
    # Apply the regular expression pattern to each value in the column
    tags = data[column].apply(lambda x: re.findall(pattern, str(x)))
    
    # Print the unique HTML tags found in the column
    unique_tags = set([tag for sublist in tags for tag in sublist])
    print(f"HTML tags in column '{column}':")
    for tag in unique_tags:
        print(tag)
    print()
HTML tags in column 'user_name':

HTML tags in column 'user_location':

HTML tags in column 'user_description':

HTML tags in column 'text':

HTML tags in column 'hashtags':

HTML tags in column 'source':

In [289]:
#No html tags identifed. However, proceeding to clean to data
In [285]:
from bs4 import BeautifulSoup

# Remove HTML tags
data['user_location'] = data['user_location'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
data['user_description'] = data['user_description'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
data['text'] = data['text'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
data['hashtags'] = data['hashtags'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
data['source'] = data['source'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
In [287]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19901 entries, 0 to 19900
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_name          19901 non-null  object
 1   user_location      19901 non-null  object
 2   user_description   19901 non-null  object
 3   user_created       19901 non-null  object
 4   user_followers     19901 non-null  int64 
 5   user_friends       19901 non-null  int64 
 6   user_favourites    19901 non-null  int64 
 7   user_verified      19901 non-null  bool  
 8   date               19901 non-null  object
 9   text               19901 non-null  object
 10  hashtags           19901 non-null  object
 11  source             19901 non-null  object
 12  is_retweet         19901 non-null  bool  
 13  Sentiment          19901 non-null  object
 14  count_Hashtags     19901 non-null  int64 
 15  Count_Hashtags     19901 non-null  int64 
 16  day                19901 non-null  object
 17  hour               19901 non-null  int64 
 18  user_created_year  19901 non-null  int64 
 19  word_count         19901 non-null  int64 
dtypes: bool(2), int64(8), object(10)
memory usage: 2.8+ MB
In [288]:
# Define a regular expression pattern to find HTML tags
pattern = r'<[^>]+>'

# Iterate over the object columns to find HTML tags
columns_to_check = ['user_name', 'user_location', 'user_description', 'text', 'hashtags', 'source']
for column in columns_to_check:
    # Apply the regular expression pattern to each value in the column
    tags = data[column].apply(lambda x: re.findall(pattern, str(x)))
    
    # Print the unique HTML tags found in the column
    unique_tags = set([tag for sublist in tags for tag in sublist])
    print(f"HTML tags in column '{column}':")
    for tag in unique_tags:
        print(tag)
    print()
HTML tags in column 'user_name':

HTML tags in column 'user_location':

HTML tags in column 'user_description':

HTML tags in column 'text':

HTML tags in column 'hashtags':

HTML tags in column 'source':

In [291]:
print('***2.C. Replace contractions in strings e.g. replace Im --> I am) and so on.***')
***2.C. Replace contractions in strings e.g. replace Im --> I am) and so on.***
In [293]:
def replace_contractions(text):
    contraction_mapping = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    for contraction, expanded_form in contraction_mapping.items():
        text = text.replace(contraction, expanded_form)
    return text

# Iterate over the rows and replace contractions in the specified columns
for index, row in data.iterrows():
    data.at[index, 'user_name'] = replace_contractions(row['user_name'])
    data.at[index, 'user_location'] = replace_contractions(row['user_location'])
    data.at[index, 'user_description'] = replace_contractions(row['user_description'])
    data.at[index, 'text'] = replace_contractions(row['text'])
    data.at[index, 'hashtags'] = replace_contractions(row['hashtags'])
    data.at[index, 'source'] = replace_contractions(row['source'])

# Verify the changes
print(data.head())
        user_name           user_location  \
0      Mari Smith    San Diego California   
1      Mari Smith    San Diego California   
2     Varun Tyagi               Goa India   
3    Casey Conway  Sydney New South Wales   
4  Charlotte Paul              Darlington   

                                    user_description         user_created  \
0  Premier Facebook Marketing Expert  Social Medi...  2007-09-11 22:22:51   
1  Premier Facebook Marketing Expert  Social Medi...  2007-09-11 22:22:51   
2  Indian  Tech Solution Artist  Hospitality Expe...  2009-09-06 10:36:01   
3  Head of Diversity  Inclusion RugbyAU  Its not ...  2012-12-28 21:45:06   
4                          Instagram Charlottejyates  2012-05-28 20:43:08   

   user_followers  user_friends  user_favourites  user_verified  \
0          579942        288625            11610          False   
1          579942        288625            11610          False   
2             257           204              475          False   
3           11782          1033            12219           True   
4             278           387             5850          False   

                  date                                               text  \
0  2020-09-16 20:55:33  musicmadmarc SocialDilemma netflix Facebook Im...   
1  2020-09-16 20:53:17  musicmadmarc SocialDilemma netflix Facebook ha...   
2  2020-09-16 20:51:57  Go watch The Social Dilemma on Netflix\n\nIts ...   
3  2020-09-16 20:51:46  I watched TheSocialDilemma last night Im scare...   
4  2020-09-16 20:51:11  The problem of me being on my phone most the t...   

           hashtags              source  is_retweet Sentiment  count_Hashtags  \
0         NoHashtag     Twitter Web App       False   Neutral               1   
1         NoHashtag     Twitter Web App       False   Neutral               1   
2         NoHashtag  Twitter for iPhone       False  Positive               1   
3  TheSocialDilemma  Twitter for iPhone       False  Negative               1   
4  TheSocialDilemma  Twitter for iPhone       False  Positive               1   

   Count_Hashtags         day  hour  user_created_year  word_count  
0               1  2020-09-16    20               2007          16  
1               1  2020-09-16    20               2007          18  
2               1  2020-09-16    20               2009          20  
3               1  2020-09-16    20               2012          22  
4               1  2020-09-16    20               2012          17  
In [319]:
print(blog['text'].iloc[3])
I watched #TheSocialDilemma last night. I’m scared for humanity. 

I’m not sure what to do but I’ve logged out of F… https://t.co/luOBcjCJFb
In [317]:
print(data['text'].iloc[3])
I watched TheSocialDilemma last night Im scared for humanity 

Im not sure what to do but Ive logged out of F httpstcoluOBcjCJFb
In [326]:
#In the above example, stricng contraction and html tag removal is witnessed after cleaning
In [321]:
print('***2.D. Remove the URL’s***')
***2.D. Remove the URL’s***
In [322]:
# Remove URLs from specific column
data['user_location'] = data['user_location'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
data['user_description'] = data['user_description'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
data['text'] = data['text'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
data['hashtags'] = data['hashtags'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
data['source'] = data['source'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
In [323]:
print(blog['text'].iloc[3])
I watched #TheSocialDilemma last night. I’m scared for humanity. 

I’m not sure what to do but I’ve logged out of F… https://t.co/luOBcjCJFb
In [324]:
print(data['text'].iloc[3])
I watched TheSocialDilemma last night Im scared for humanity 

Im not sure what to do but Ive logged out of F 
In [325]:
#In the above example, URL removal is effective after cleaning
In [328]:
print('***2.E. Remove the mentions in the tweets (@)***')
***2.E. Remove the mentions in the tweets (@)***
In [329]:
# Remove the mentions in the twwets'@' from specific column
data['user_location'] = data['user_location'].apply(lambda x: re.sub(r'@\w+', '', x))
data['user_description'] = data['user_description'].apply(lambda x: re.sub(r'@\w+', '', x))
data['text'] = data['text'].apply(lambda x: re.sub(r'@\w+', '', x))
data['hashtags'] = data['hashtags'].apply(lambda x: re.sub(r'@\w+', '', x))
data['source'] = data['source'].apply(lambda x: re.sub(r'@\w+', '', x))
In [330]:
blog.head()
Out[330]:
user_name user_location user_description user_created user_followers user_friends user_favourites user_verified date text hashtags source is_retweet Sentiment
0 Mari Smith San Diego, California Premier Facebook Marketing Expert | Social Med... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:55:33 @musicmadmarc @SocialDilemma_ @netflix @Facebo... NaN Twitter Web App False Neutral
1 Mari Smith San Diego, California Premier Facebook Marketing Expert | Social Med... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:53:17 @musicmadmarc @SocialDilemma_ @netflix @Facebo... NaN Twitter Web App False Neutral
2 Varun Tyagi Goa, India Indian | Tech Solution Artist & Hospitality Ex... 2009-09-06 10:36:01 257 204 475 False 2020-09-16 20:51:57 Go watch “The Social Dilemma” on Netflix!\n\nI... NaN Twitter for iPhone False Positive
3 Casey Conway Sydney, New South Wales Head of Diversity & Inclusion @RugbyAU | It's ... 2012-12-28 21:45:06 11782 1033 12219 True 2020-09-16 20:51:46 I watched #TheSocialDilemma last night. I’m sc... ['TheSocialDilemma'] Twitter for iPhone False Negative
4 Charlotte Paul Darlington Instagram Charlottejyates 2012-05-28 20:43:08 278 387 5850 False 2020-09-16 20:51:11 The problem of me being on my phone most the t... ['TheSocialDilemma'] Twitter for iPhone False Positive
In [331]:
data.head()
Out[331]:
user_name user_location user_description user_created user_followers user_friends user_favourites user_verified date text hashtags source is_retweet Sentiment count_Hashtags Count_Hashtags day hour user_created_year word_count
0 Mari Smith San Diego California Premier Facebook Marketing Expert Social Medi... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:55:33 musicmadmarc SocialDilemma netflix Facebook Im... NoHashtag Twitter Web App False Neutral 1 1 2020-09-16 20 2007 16
1 Mari Smith San Diego California Premier Facebook Marketing Expert Social Medi... 2007-09-11 22:22:51 579942 288625 11610 False 2020-09-16 20:53:17 musicmadmarc SocialDilemma netflix Facebook ha... NoHashtag Twitter Web App False Neutral 1 1 2020-09-16 20 2007 18
2 Varun Tyagi Goa India Indian Tech Solution Artist Hospitality Expe... 2009-09-06 10:36:01 257 204 475 False 2020-09-16 20:51:57 Go watch The Social Dilemma on Netflix\n\nIts ... NoHashtag Twitter for iPhone False Positive 1 1 2020-09-16 20 2009 20
3 Casey Conway Sydney New South Wales Head of Diversity Inclusion RugbyAU Its not ... 2012-12-28 21:45:06 11782 1033 12219 True 2020-09-16 20:51:46 I watched TheSocialDilemma last night Im scare... TheSocialDilemma Twitter for iPhone False Negative 1 1 2020-09-16 20 2012 22
4 Charlotte Paul Darlington Instagram Charlottejyates 2012-05-28 20:43:08 278 387 5850 False 2020-09-16 20:51:11 The problem of me being on my phone most the t... TheSocialDilemma Twitter for iPhone False Positive 1 1 2020-09-16 20 2012 17
In [332]:
#From the above example, it is observed @ has been removed in text column after cleaning
In [333]:
print('***2.F. Remove all Stopwords***')
***2.F. Remove all Stopwords***
In [334]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Remove stopwords from the 'text' column
stopwords = set(stopwords.words('english'))
data['user_description'] = data['user_description'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords]))
data['user_location'] = data['user_location'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords]))
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords]))
data['hashtags'] = data['hashtags'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords]))
data['source'] = data['source'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords]))
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Richard\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [339]:
print(blog['text'].iloc[1])
@musicmadmarc @SocialDilemma_ @netflix @Facebook haa, hey Marc. I get what you're saying &amp; don't agree. 🤪

Whicheve… https://t.co/nsVtPHjUs8
In [340]:
print(data['text'].iloc[1])
musicmadmarc SocialDilemma netflix Facebook haa hey Marc get youre saying amp dont agree Whicheve
In [341]:
# Th above example explains after stopwords I, what removal
In [342]:
print('***2.G. Lowercase all textual data***')
***2.G. Lowercase all textual data***
In [344]:
data['user_name']=data['user_name'].str.lower()
data['user_description']=data['user_description'].str.lower()
data['text']=data['text'].str.lower()
data['hashtags']=data['hashtags'].str.lower()
data['source']=data['source'].str.lower()
In [345]:
print(blog['user_name'].iloc[1])
Mari Smith
In [346]:
print(data['user_name'].iloc[1])
mari smith
In [347]:
print(blog['user_description'].iloc[1])
Premier Facebook Marketing Expert | Social Media Thought Leader | Keynote Speaker | Dynamic Live Video Host | Ambassador | 🇨🇦🏴󠁧󠁢󠁳󠁣󠁴󠁿🇺🇸
In [348]:
print(data['user_description'].iloc[1])
premier facebook marketing expert social media thought leader keynote speaker dynamic live video host ambassador
In [349]:
# From the above examples, it is observed that lower case has been effective after cleaning of data
In [350]:
print('***2.H. Perform tokenization, lemmatization, normalization appropriately***')
***2.H. Perform tokenization, lemmatization, normalization appropriately***
In [358]:
!pip install --upgrade nltk
Requirement already satisfied: nltk in f:\anaconda3\lib\site-packages (3.7)
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
Requirement already satisfied: tqdm in f:\anaconda3\lib\site-packages (from nltk) (4.64.0)
Requirement already satisfied: joblib in f:\anaconda3\lib\site-packages (from nltk) (1.2.0)
Requirement already satisfied: click in f:\anaconda3\lib\site-packages (from nltk) (8.0.4)
Requirement already satisfied: regex>=2021.8.3 in f:\anaconda3\lib\site-packages (from nltk) (2022.3.15)
Requirement already satisfied: colorama in f:\anaconda3\lib\site-packages (from click->nltk) (0.4.4)
Installing collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.7
    Uninstalling nltk-3.7:
      Successfully uninstalled nltk-3.7
Successfully installed nltk-3.8.1
In [373]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Download required resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
stopwords_set = set(stopwords.words('english'))

def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.lower() not in stopwords_set]
    return lemmatized_tokens

# Tokenize and lemmatize the text column
data['text_tokens_Lem'] = data['text'].apply(tokenize_and_lemmatize)

# Tokenize and lemmatize the user_description column
data['user_description_tokens_lem'] = data['user_description'].apply(tokenize_and_lemmatize)

# Print the updated dataset
print(data[['text', 'text_tokens_Lem','user_description','user_description_tokens_lem']].head())
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Richard\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Richard\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Richard\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Richard\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
                                                text  \
0  musicmadmarc socialdilemma netflix facebook im...   
1  musicmadmarc socialdilemma netflix facebook ha...   
2  go watch social dilemma netflix best minutes y...   
3  watched thesocialdilemma last night im scared ...   
4   problem phone time trying watch thesocialdilemma   

                                     text_tokens_Lem  \
0  [musicmadmarc, socialdilemma, netflix, faceboo...   
1  [musicmadmarc, socialdilemma, netflix, faceboo...   
2  [go, watch, social, dilemma, netflix, best, mi...   
3  [watched, thesocialdilemma, last, night, im, s...   
4  [problem, phone, time, trying, watch, thesocia...   

                                    user_description  \
0  premier facebook marketing expert social media...   
1  premier facebook marketing expert social media...   
2  indian tech solution artist hospitality expert...   
3  head diversity inclusion rugbyau tan im aborig...   
4                          instagram charlottejyates   

                         user_description_tokens_lem  
0  [premier, facebook, marketing, expert, social,...  
1  [premier, facebook, marketing, expert, social,...  
2  [indian, tech, solution, artist, hospitality, ...  
3  [head, diversity, inclusion, rugbyau, tan, im,...  
4                       [instagram, charlottejyates]  
In [371]:
# The above code is applied to tokenenize and lemmatize the text data. Now we shall apply normalize on the above data
In [374]:
def normalize_text(tokens):
    normalized_tokens = []
    for token in tokens:
        # Lowercase the token
        normalized_token = token.lower()
        
        # Remove punctuation and special characters
        normalized_token = re.sub(r'[^a-zA-Z0-9]', '', normalized_token)
        
        # Add the normalized token to the list
        normalized_tokens.append(normalized_token)
    
    return normalized_tokens

# Apply text normalization to the 'text_tokens_Lem' column
data['text_normalized'] = data['text_tokens_Lem'].apply(normalize_text)

# Apply text normalization to the 'user_description_tokens_lem' column
data['user_description_normalized'] = data['user_description_tokens_lem'].apply(normalize_text)

# Print the updated dataset
print(data[['text', 'text_normalized','user_description','user_description_normalized']].head())
                                                text  \
0  musicmadmarc socialdilemma netflix facebook im...   
1  musicmadmarc socialdilemma netflix facebook ha...   
2  go watch social dilemma netflix best minutes y...   
3  watched thesocialdilemma last night im scared ...   
4   problem phone time trying watch thesocialdilemma   

                                     text_normalized  \
0  [musicmadmarc, socialdilemma, netflix, faceboo...   
1  [musicmadmarc, socialdilemma, netflix, faceboo...   
2  [go, watch, social, dilemma, netflix, best, mi...   
3  [watched, thesocialdilemma, last, night, im, s...   
4  [problem, phone, time, trying, watch, thesocia...   

                                    user_description  \
0  premier facebook marketing expert social media...   
1  premier facebook marketing expert social media...   
2  indian tech solution artist hospitality expert...   
3  head diversity inclusion rugbyau tan im aborig...   
4                          instagram charlottejyates   

                         user_description_normalized  
0  [premier, facebook, marketing, expert, social,...  
1  [premier, facebook, marketing, expert, social,...  
2  [indian, tech, solution, artist, hospitality, ...  
3  [head, diversity, inclusion, rugbyau, tan, im,...  
4                       [instagram, charlottejyates]  
In [375]:
print('***I. Remove the hashtags***')
***I. Remove the hashtags***
In [378]:
def remove_hashtags(text):
    # Check if the input is a string or bytes-like object
    if isinstance(text, str):
        # Remove hashtags using regular expressions
        text_without_hashtags = re.sub(r'#\w+', '', text)
        return text_without_hashtags
    else:
        # Return empty string if the input is not a string
        return ''

# Apply hashtag removal to the 'text_normalized' column
data['text_without_hashtags'] = data['text_normalized'].apply(lambda x: remove_hashtags(str(x)))

# Apply hashtag removal to the 'user_description_normalized' column
data['user_description_without_hashtags'] = data['user_description_normalized'].apply(lambda x: remove_hashtags(str(x)))
In [380]:
# Print the updated dataset
print(data[['text', 'text_without_hashtags','user_description','user_description_without_hashtags']].head(10))
                                                text  \
0  musicmadmarc socialdilemma netflix facebook im...   
1  musicmadmarc socialdilemma netflix facebook ha...   
2  go watch social dilemma netflix best minutes y...   
3  watched thesocialdilemma last night im scared ...   
4   problem phone time trying watch thesocialdilemma   
5  thesocialdilemma wow need regulations social m...   
6  erm thesocialdilemma makes want go grid live c...   
7           thesocialdilemma documentary horror live   
8                  okay im watching thesocialdilemma   
9  okey okey ive peer pressured watching thesocia...   

                               text_without_hashtags  \
0  ['musicmadmarc', 'socialdilemma', 'netflix', '...   
1  ['musicmadmarc', 'socialdilemma', 'netflix', '...   
2  ['go', 'watch', 'social', 'dilemma', 'netflix'...   
3  ['watched', 'thesocialdilemma', 'last', 'night...   
4  ['problem', 'phone', 'time', 'trying', 'watch'...   
5  ['thesocialdilemma', 'wow', 'need', 'regulatio...   
6  ['erm', 'thesocialdilemma', 'make', 'want', 'g...   
7  ['thesocialdilemma', 'documentary', 'horror', ...   
8     ['okay', 'im', 'watching', 'thesocialdilemma']   
9  ['okey', 'okey', 'ive', 'peer', 'pressured', '...   

                                    user_description  \
0  premier facebook marketing expert social media...   
1  premier facebook marketing expert social media...   
2  indian tech solution artist hospitality expert...   
3  head diversity inclusion rugbyau tan im aborig...   
4                          instagram charlottejyates   
5                                                nan   
6  mother optimist feminist pacifist retired deli...   
7  african music lakersmanchester united dark cho...   
8  igryanwhitec digital content creator beat open...   
9  science kid herbivore opinionated tweets cultu...   

                   user_description_without_hashtags  
0  ['premier', 'facebook', 'marketing', 'expert',...  
1  ['premier', 'facebook', 'marketing', 'expert',...  
2  ['indian', 'tech', 'solution', 'artist', 'hosp...  
3  ['head', 'diversity', 'inclusion', 'rugbyau', ...  
4                   ['instagram', 'charlottejyates']  
5                                            ['nan']  
6  ['mother', 'optimist', 'feminist', 'pacifist',...  
7  ['african', 'music', 'lakersmanchester', 'unit...  
8  ['igryanwhitec', 'digital', 'content', 'creato...  
9  ['science', 'kid', 'herbivore', 'opinionated',...  
In [382]:
print('***3. Build a base Classification model***')
***3. Build a base Classification model***
In [384]:
print('***3.A. Create dependent and independent variables***')
***3.A. Create dependent and independent variables***
In [381]:
data.columns
Out[381]:
Index(['user_name', 'user_location', 'user_description', 'user_created',
       'user_followers', 'user_friends', 'user_favourites', 'user_verified',
       'date', 'text', 'hashtags', 'source', 'is_retweet', 'Sentiment',
       'count_Hashtags', 'Count_Hashtags', 'day', 'hour', 'user_created_year',
       'word_count', 'text_tokens', 'user_description_tokens',
       'text_tokens_Lem', 'user_description_tokens_lem', 'text_normalized',
       'user_description_normalized', 'text_without_hashtags',
       'user_description_without_hashtags'],
      dtype='object')
In [383]:
# Dependent Variable
y = data['Sentiment']

# Independent Variables
x = data[['user_description_without_hashtags','text_without_hashtags']]
In [386]:
print('***3.B. Split data into train and test***')
***3.B. Split data into train and test***
In [387]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
In [388]:
print('***3.C. Vectorize data using any one vectorizer, so that we can feed the data in the model***')
***3.C. Vectorize data using any one vectorizer, so that we can feed the data in the model***
In [389]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the training data
x_train_vectorized = vectorizer.fit_transform(x_train)

# Transform the testing data
x_test_vectorized = vectorizer.transform(x_test)
In [390]:
#Using TFid Vectorizer for traiign the model
In [391]:
print('***D. Build a base model for Supervised Learning - Classification***')
***D. Build a base model for Supervised Learning - Classification***
In [393]:
# Check the dimensions of the train and test sets
print("Training set shape:", x_train.shape, y_train.shape)
print("Testing set shape:", x_test.shape, y_test.shape)
Training set shape: (15920, 2) (15920,)
Testing set shape: (3981, 2) (3981,)
In [435]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Vectorize the text data using CountVectorizer
vectorizer = CountVectorizer()
x_train_vectorized = vectorizer.fit_transform(x_train['text_without_hashtags'])
x_test_vectorized = vectorizer.transform(x_test['text_without_hashtags'])

# Convert the target variable into numeric labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Build the base model using Logistic Regression
model = LogisticRegression(max_iter=1000)

# Train the model
logreg=model.fit(x_train_vectorized, y_train_encoded)

# Make predictions on the test set
y_pred_encoded = model.predict(x_test_vectorized)

# Convert the predicted labels back to original labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(accuracy,2))
Accuracy: 0.87
In [401]:
#In the base model, the accuracy is 87%.This needs to be imporved further with alternate models or hyper tuning of parameters
In [402]:
print('***3.E. Clearly print Performance Metrics.***')
***3.E. Clearly print Performance Metrics.***
In [404]:
from sklearn.metrics import classification_report

# Make predictions on the test set
y_pred = model.predict(x_test_vectorized)

# Print the classification report
print(classification_report(y_test_encoded, y_pred))
              precision    recall  f1-score   support

           0       0.86      0.69      0.76       660
           1       0.84      0.95      0.89      1405
           2       0.91      0.89      0.90      1916

    accuracy                           0.87      3981
   macro avg       0.87      0.84      0.85      3981
weighted avg       0.88      0.87      0.87      3981

In [405]:
y_train.value_counts()
Out[405]:
Positive    7541
Neutral     5482
Negative    2897
Name: Sentiment, dtype: int64
In [406]:
#The following observations are made based on the above report:
    # Recall & f1-score are low for Positive Sentiment
    # Overall score in neutral 7 negative sentiments are above 85%. However, this could be imporved with hyper-tuning/alterante models
    # Average in all 3 classes are above 80%. However, we shall try to find opportunities to improve further.
In [408]:
print('***3.F. Improve performance of the model and mention which parameter/hyperparameter significantly helped to improve performance and its probable reason***')
***3.F. Improve performance of the model and mention which parameter/hyperparameter significantly helped to improve performance and its probable reason***
In [415]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the hyperparameters and their possible values
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Regularization penalty type
    'solver': ['liblinear', 'saga']  # Optimization algorithm
}

# Create the logistic regression model
model = LogisticRegression(max_iter=2000)

# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train_vectorized, y_train_encoded)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best hyperparameters to train the final model
best_model = LogisticRegression(**best_params)
best_model.fit(x_train_vectorized, y_train_encoded)

# Evaluate the model on the test set
y_pred_gridCV = best_model.predict(x_test_vectorized)
print("Performance Metrics:")
print(classification_report(y_test_encoded, y_pred_gridCV))
F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

Best Hyperparameters: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Performance Metrics:
              precision    recall  f1-score   support

           0       0.88      0.72      0.79       660
           1       0.85      0.97      0.91      1405
           2       0.93      0.90      0.91      1916

    accuracy                           0.89      3981
   macro avg       0.89      0.86      0.87      3981
weighted avg       0.89      0.89      0.89      3981

F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning:

The max_iter was reached which means the coef_ did not converge

In [ ]:
 
In [417]:
print('***3.G. Try at least three different models and identify which model performs best. Print and plot Confusion matirx to get an idea of how the distribution of the prediction is among all the classes. Write your inferences on the same.***')
***3.G. Try at least three different models and identify which model performs best. Print and plot Confusion matirx to get an idea of how the distribution of the prediction is among all the classes. Write your inferences on the same.***
In [418]:
from sklearn.svm import SVC

# Create the SVM model
svm = SVC()

# Train the model
svm.fit(x_train_vectorized, y_train_encoded)

# Predict on the test set
y_pred_svm = svm.predict(x_test_vectorized)

# Evaluate the model
print("SVM Performance Metrics:")
print(classification_report(y_test_encoded, y_pred_svm))
SVM Performance Metrics:
              precision    recall  f1-score   support

           0       0.88      0.51      0.64       660
           1       0.78      0.97      0.86      1405
           2       0.89      0.86      0.87      1916

    accuracy                           0.84      3981
   macro avg       0.85      0.78      0.79      3981
weighted avg       0.85      0.84      0.83      3981

In [419]:
from sklearn.naive_bayes import MultinomialNB

# Create the Naive Bayes model
naive_bayes = MultinomialNB()

# Train the model
naive_bayes.fit(x_train_vectorized, y_train_encoded)

# Predict on the test set
y_pred_naive_bayes = naive_bayes.predict(x_test_vectorized)

# Evaluate the model
print("Naive Bayes Performance Metrics:")
print(classification_report(y_test_encoded, y_pred_naive_bayes))
Naive Bayes Performance Metrics:
              precision    recall  f1-score   support

           0       0.78      0.45      0.57       660
           1       0.85      0.70      0.77      1405
           2       0.72      0.92      0.81      1916

    accuracy                           0.77      3981
   macro avg       0.79      0.69      0.72      3981
weighted avg       0.78      0.77      0.76      3981

In [420]:
from sklearn.ensemble import RandomForestClassifier

# Create the Random Forest model
random_forest = RandomForestClassifier()

# Train the model
random_forest.fit(x_train_vectorized, y_train_encoded)

# Predict on the test set
y_pred_random_forest = random_forest.predict(x_test_vectorized)

# Evaluate the model
print("Random Forest Performance Metrics:")
print(classification_report(y_test_encoded, y_pred_random_forest))
Random Forest Performance Metrics:
              precision    recall  f1-score   support

           0       0.87      0.52      0.66       660
           1       0.80      0.96      0.87      1405
           2       0.88      0.87      0.87      1916

    accuracy                           0.84      3981
   macro avg       0.85      0.78      0.80      3981
weighted avg       0.85      0.84      0.84      3981

In [ ]:
 
In [ ]:
 
In [421]:
print('***3.H. Wordcloud of top 40 important features from the final model chosen***')
***3.H. Wordcloud of top 40 important features from the final model chosen***
In [ ]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np

from sklearn.inspection import permutation_importance

result = permutation_importance(logreg, x_test_vectorized.toarray(), y_test_encoded, n_repeats=10, random_state=42)
feature_importance_scores = result.importances_mean

# Sort the features based on their importance scores
sorted_features = sorted(feature_importance_scores.items(), key=lambda x: x[1], reverse=True)

# Select the top 40 features
top_features = dict(sorted_features[:40])

# Generate a word cloud visualization
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(top_features)

# Display the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
In [ ]: